{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "LXzvg_0MUzhA"
      },
      "source": [
        "## Integrating Unstructured and Graph Knowledge with Neo4j and LangChain for Enhanced Question Answering"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "b0AVqZ9XVC9Z"
      },
      "source": [
        "\n",
        "\n",
        "#### Installing Dependencies"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "F52G_upjVIGt",
        "outputId": "9e26d2c5-6294-4c3f-de4f-5a51e8961003"
      },
      "outputs": [],
      "source": [
        "# !pip install -qU \\\n",
        "#        transformers \\\n",
        "#        datasets \\\n",
        "#        langchain \\\n",
        "#        openai \\\n",
        "#        wikipedia \\\n",
        "#        tiktoken \\\n",
        "#        neo4j \\\n",
        "#        python-dotenv"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "B_5pjB2WX_DZ"
      },
      "source": [
        "#### Importing Packages"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {
        "id": "R-h-iIDmYFGh"
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "c:\\Users\\ibm26\\anaconda3\\envs\\pharmagpt\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
            "  from .autonotebook import tqdm as notebook_tqdm\n",
            "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n"
          ]
        }
      ],
      "source": [
        "import os\n",
        "import re\n",
        "from langchain.vectorstores.neo4j_vector import Neo4jVector\n",
        "from langchain.document_loaders import WikipediaLoader\n",
        "from langchain.embeddings.openai import OpenAIEmbeddings\n",
        "from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter\n",
        "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
        "from dotenv import load_dotenv"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "_JY_gy3BqptG"
      },
      "source": [
        "#### Setting API Keys and Connection Settings in Environment Variables"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {
        "id": "BIHTHxNtYGRN"
      },
      "outputs": [],
      "source": [
        "from getpass import getpass\n",
        "\n",
        "# Load OPENAI_API_KEY and any NEO4J_* settings from a local .env file, if one exists.\n",
        "load_dotenv()\n",
        "\n",
        "# Defaults for a local Neo4j instance. setdefault keeps any value that is already\n",
        "# present in the environment instead of overwriting what was just loaded.\n",
        "os.environ.setdefault(\"NEO4J_URI\", \"bolt://localhost:7687\")\n",
        "os.environ.setdefault(\"NEO4J_USERNAME\", \"neo4j\")\n",
        "\n",
        "# Never store the database password in the notebook: read it from the\n",
        "# environment / .env and prompt for it only when it is missing.\n",
        "if not os.environ.get(\"NEO4J_PASSWORD\"):\n",
        "    os.environ[\"NEO4J_PASSWORD\"] = getpass(\"Neo4j password: \")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 4,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "lhhy1qO5orHa",
        "outputId": "a99bdcfc-56e3-43c1-8926-562af2f19e44"
      },
      "outputs": [],
      "source": [
        "# Confirm the connection settings without echoing the secret itself:\n",
        "# only report whether a password is present, never its value.\n",
        "print(os.getenv(\"NEO4J_URI\"))\n",
        "print(os.getenv(\"NEO4J_USERNAME\"))\n",
        "print(\"NEO4J_PASSWORD is set:\", bool(os.getenv(\"NEO4J_PASSWORD\")))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "QiTvOmA8rgeZ"
      },
      "source": [
        "#### Data Preprocessing"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 4,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "lUF9fXmvYO1h",
        "outputId": "a5b6c25c-5672-455d-e785-4c20148aa013"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Number of tokens: 11\n"
          ]
        }
      ],
      "source": [
        "from transformers import AutoTokenizer\n",
        "\n",
        "# Pretrained \"bert-base-uncased\" tokenizer, used here only to measure text length.\n",
        "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n",
        "\n",
        "\n",
        "def bert_len(text):\n",
        "    \"\"\"Return the number of token ids `tokenizer.encode` produces for `text`.\n",
        "\n",
        "    NOTE(review): encode() is called with its defaults, which presumably also\n",
        "    counts BERT's special tokens -- confirm that is intended for chunk sizing.\n",
        "    \"\"\"\n",
        "    return len(tokenizer.encode(text))\n",
        "\n",
        "\n",
        "# Sanity check on a short sentence.\n",
        "input_text = \"This is a sample sentence for tokenization.\"\n",
        "num_tokens = bert_len(input_text)\n",
        "print(f\"Number of tokens: {num_tokens}\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 5,
      "metadata": {},
      "outputs": [],
      "source": [
        "from langchain.document_loaders import PyPDFLoader\n",
        "\n",
        "loader = PyPDFLoader(\"./docs/YouCanHaveAnAmazingMemoryLearn.pdf\")\n",
        "pages = loader.load_and_split()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "YQqG0o7KbmqZ",
        "outputId": "7b8e48d5-df80-4d1a-d192-11ce439fc62a"
      },
      "outputs": [],
      "source": [
        "# # Load Wikipedia articles related to \"Sachin Tendulkar\"\n",
        "# raw_documents = WikipediaLoader(query=\"Sachin Tendulkar\").load()\n",
        "\n",
        "# # Define a text splitter with specific parameters\n",
        "# text_splitter = RecursiveCharacterTextSplitter(\n",
        "#     chunk_size=200, chunk_overlap=20, length_function=bert_len, separators=['\\n\\n', '\\n', ' ', '']\n",
        "# )\n",
        "\n",
        "# # Split the content of the first Wikipedia article into smaller documents\n",
        "# documents = text_splitter.create_documents([raw_documents[0].page_content])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 6,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Define a text splitter whose chunk size and overlap are measured with bert_len.\n",
        "text_splitter = RecursiveCharacterTextSplitter(\n",
        "    chunk_size=1000, chunk_overlap=200, length_function=bert_len, separators=['\\n\\n', '\\n', ' ', '']\n",
        ")\n",
        "\n",
        "# Split the text of a single PDF page (index 4) into smaller documents.\n",
        "documents = text_splitter.create_documents([pages[4].page_content])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "c68Duv2Nbqqk",
        "outputId": "4510be6f-af0c-4c08-e814-da06bf75f7f2"
      },
      "outputs": [],
      "source": [
        "# Number of chunks produced by the splitter. This runs after the cell that\n",
        "# defines `documents`, so the notebook works on a fresh kernel.\n",
        "print(len(documents))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "X043ugczr0X5"
      },
      "source": [
        "#### Initializing the Neo4j Graph Database"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 7,
      "metadata": {
        "id": "RSHWwlbJcCi2"
      },
      "outputs": [],
      "source": [
        "# Embed the chunks with OpenAI and store them in Neo4j through Neo4jVector.\n",
        "# NOTE(review): the saved similarity-search output shows the same chunk twice,\n",
        "# so re-running this cell presumably inserts the documents again -- confirm.\n",
        "embeddings = OpenAIEmbeddings()\n",
        "neo4j_connection = dict(\n",
        "    url=os.environ[\"NEO4J_URI\"],\n",
        "    username=os.environ[\"NEO4J_USERNAME\"],\n",
        "    password=os.environ[\"NEO4J_PASSWORD\"],\n",
        ")\n",
        "neo4j_vector = Neo4jVector.from_documents(documents, embeddings, **neo4j_connection)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "34Fm9UgHwWdG"
      },
      "source": [
        "#### Performing Similarity Search on Ingested Documents"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 8,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "qp8bl2hjruzg",
        "outputId": "b5922d9e-7f16-4250-f917-9af03d361fa4"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Chapter 26:\n",
            " Using the tools: Study and learning\n",
            "Chapter 27:\n",
            " Using the tools: Everyday ways to train your\n",
            "memory\n",
            "Chapter 28:\n",
            " Using the tools: Just for fun\n",
            "Chapter 29:\n",
            " Age equals experience, not forgetfulness!\n",
            "Chapter 30:\n",
            " I’ve done all that, now what can I expect?\n",
            "Chapter 31:\n",
            " Look at what you can do now!\n",
            " \n",
            "Afterword: The champions of the future\n",
            " \n",
            "Index\n",
            "\n",
            "Chapter 26:\n",
            " Using the tools: Study and learning\n",
            "Chapter 27:\n",
            " Using the tools: Everyday ways to train your\n",
            "memory\n",
            "Chapter 28:\n",
            " Using the tools: Just for fun\n",
            "Chapter 29:\n",
            " Age equals experience, not forgetfulness!\n",
            "Chapter 30:\n",
            " I’ve done all that, now what can I expect?\n",
            "Chapter 31:\n",
            " Look at what you can do now!\n",
            " \n",
            "Afterword: The champions of the future\n",
            " \n",
            "Index\n"
          ]
        }
      ],
      "source": [
        "# Question to run against the vector index.\n",
        "query = \"What is the introduction on book?\"\n",
        "\n",
        "# Fetch the two best-matching chunks.\n",
        "vector_results = neo4j_vector.similarity_search(query, k=2)\n",
        "\n",
        "# Show every hit, leaving one blank line between consecutive hits.\n",
        "if vector_results:\n",
        "    print(\"\\n\\n\".join(hit.page_content for hit in vector_results))\n",
        "\n",
        "# Keep the text of the top-ranked hit.\n",
        "vector_result = vector_results[0].page_content"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "yLCySJqcxV3W"
      },
      "source": [
        "#### Building Knowledge Graph"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "EOHIAZrLxh8N"
      },
      "outputs": [],
      "source": [
        "# Necessary Libraries to setup the Neo4j DB QuestionAnswering Chain\n",
        "from langchain.chat_models import ChatOpenAI\n",
        "from langchain.chains import GraphCypherQAChain\n",
        "from langchain.graphs import Neo4jGraph"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "N_sqke-SzfEQ"
      },
      "outputs": [],
      "source": [
        "# Connect to the Neo4j database, reusing the connection settings from the\n",
        "# environment (as the vector-store cell does) instead of hardcoding credentials.\n",
        "graph = Neo4jGraph(\n",
        "    url=os.environ[\"NEO4J_URI\"],\n",
        "    username=os.environ[\"NEO4J_USERNAME\"],\n",
        "    password=os.environ[\"NEO4J_PASSWORD\"],\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "oHFGTHCEz_UD",
        "outputId": "eaf7ba8f-7188-4b5a-f4b3-f786f4a457ae"
      },
      "outputs": [],
      "source": [
        "# Print the schema of the Neo4j graph.\n",
        "print(graph.schema)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "wN_9M9fi0OV8"
      },
      "outputs": [],
      "source": [
        "# Create a question-answering chain over the Neo4j graph using an OpenAI chat model,\n",
        "# with verbose mode enabled.\n",
        "# temperature=0 keeps the generated Cypher as deterministic as possible; a high\n",
        "# temperature randomizes query generation and makes invalid queries more likely.\n",
        "chain = GraphCypherQAChain.from_llm(\n",
        "    ChatOpenAI(temperature=0), graph=graph, verbose=True\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Iw0foDNr0c-v",
        "outputId": "8a776932-7768-4191-d2a4-37c2163b9d2e"
      },
      "outputs": [],
      "source": [
        "# Ask the graph QA chain a question and keep its answer.\n",
        "graph_question = \"What is the book about?\"\n",
        "graph_result = chain.run(graph_question)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 37
        },
        "id": "ljHsyK3z0pAf",
        "outputId": "7d95b141-ce27-4404-9e09-1f12c07f3ab8"
      },
      "outputs": [],
      "source": [
        "graph_result"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.9.18"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
